This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.
library(dplyr)
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
# Step 1: Load the dataset
crime_data <- read.csv("Crimes_-_2001_to_Present_20241205.csv")
summary(crime_data)
ID Case.Number Date Block IUCR Primary.Type Description
Min. : 634 Length:8212899 Length:8212899 Length:8212899 Length:8212899 Length:8212899 Length:8212899
1st Qu.: 3966344 Class :character Class :character Class :character Class :character Class :character Class :character
Median : 7352976 Mode :character Mode :character Mode :character Mode :character Mode :character Mode :character
Mean : 7355141
3rd Qu.:10667728
Max. :13681835
Location.Description Arrest Domestic Beat District Ward Community.Area
Length:8212899 Length:8212899 Length:8212899 Min. : 111 Min. : 1.0 Min. : 1.0 Min. : 0.0
Class :character Class :character Class :character 1st Qu.: 621 1st Qu.: 6.0 1st Qu.:10.0 1st Qu.:23.0
Mode :character Mode :character Mode :character Median :1034 Median :10.0 Median :23.0 Median :32.0
Mean :1184 Mean :11.3 Mean :22.8 Mean :37.4
3rd Qu.:1731 3rd Qu.:17.0 3rd Qu.:34.0 3rd Qu.:56.0
Max. :2535 Max. :31.0 Max. :50.0 Max. :77.0
NA's :47 NA's :614830 NA's :613456
FBI.Code X.Coordinate Y.Coordinate Year Updated.On Latitude Longitude
Length:8212899 Min. : 0 Min. : 0 Min. :2001 Length:8212899 Min. :36.62 Min. :-91.69
Class :character 1st Qu.:1153039 1st Qu.:1859167 1st Qu.:2005 Class :character 1st Qu.:41.77 1st Qu.:-87.71
Mode :character Median :1166176 Median :1890924 Median :2010 Mode :character Median :41.86 Median :-87.67
Mean :1164632 Mean :1885878 Mean :2011 Mean :41.84 Mean :-87.67
3rd Qu.:1176391 3rd Qu.:1909391 3rd Qu.:2016 3rd Qu.:41.91 3rd Qu.:-87.63
Max. :1205119 Max. :1951622 Max. :2024 Max. :42.02 Max. :-87.52
NA's :90296 NA's :90296 NA's :90296 NA's :90296
Location
Length:8212899
Class :character
Mode :character
head(crime_data$Date)
[1] "03/18/2015 12:00:00 PM" "12/20/2018 03:00:00 PM" "05/01/2016 12:25:00 AM" "12/20/2018 04:00:00 PM" "06/01/2014 12:01:00 AM"
[6] "09/01/2018 12:01:00 AM"
nrow(crime_data)
[1] 8212899
limited_crime_data = crime_data
head(limited_crime_data$Date)
[1] "03/18/2015 12:00:00 PM" "12/20/2018 03:00:00 PM" "05/01/2016 12:25:00 AM" "12/20/2018 04:00:00 PM" "06/01/2014 12:01:00 AM"
[6] "09/01/2018 12:01:00 AM"
limited_crime_data$Date <- as.Date(limited_crime_data$Date, format = "%m/%d/%Y")
filtered_data <- limited_crime_data %>%
filter(between(Date, as.Date("2001-01-01"),
as.Date("2015-04-01")))
summary(filtered_data)
ID Case.Number Date Block IUCR Primary.Type Description
Min. : 634 Length:5779677 Min. :2001-01-01 Length:5779677 Length:5779677 Length:5779677 Length:5779677
1st Qu.: 3123052 Class :character 1st Qu.:2003-12-28 Class :character Class :character Class :character Class :character
Median : 5392262 Mode :character Median :2007-03-08 Mode :character Mode :character Mode :character Mode :character
Mean : 5463237 Mean :2007-06-09
3rd Qu.: 7713084 3rd Qu.:2010-09-06
Max. :13679478 Max. :2015-04-01
Location.Description Arrest Domestic Beat District Ward Community.Area
Length:5779677 Length:5779677 Length:5779677 Min. : 111 Min. : 1.00 Min. : 1.0 Min. : 0.0
Class :character Class :character Class :character 1st Qu.: 623 1st Qu.: 6.00 1st Qu.:10.0 1st Qu.:23.0
Mode :character Mode :character Mode :character Median :1111 Median :10.00 Median :22.0 Median :32.0
Mean :1200 Mean :11.32 Mean :22.6 Mean :37.7
3rd Qu.:1732 3rd Qu.:17.00 3rd Qu.:34.0 3rd Qu.:58.0
Max. :2535 Max. :31.00 Max. :50.0 Max. :77.0
NA's :46 NA's :614776 NA's :613452
FBI.Code X.Coordinate Y.Coordinate Year Updated.On Latitude Longitude
Length:5779677 Min. : 0 Min. : 0 Min. :2001 Length:5779677 Min. :36.62 Min. :-91.69
Class :character 1st Qu.:1152920 1st Qu.:1859151 1st Qu.:2003 Class :character 1st Qu.:41.77 1st Qu.:-87.71
Mode :character Median :1165913 Median :1890081 Median :2007 Mode :character Median :41.85 Median :-87.67
Mean :1164477 Mean :1885617 Mean :2007 Mean :41.84 Mean :-87.67
3rd Qu.:1176336 3rd Qu.:1909421 3rd Qu.:2010 3rd Qu.:41.91 3rd Qu.:-87.63
Max. :1205119 Max. :1951622 Max. :2015 Max. :42.02 Max. :-87.52
NA's :53228 NA's :53228 NA's :53228 NA's :53228
Location
Length:5779677
Class :character
Mode :character
nrow(filtered_data)
[1] 5779677
head(filtered_data)
filtered_data <-na.omit(filtered_data)
filtered_data$Date <- as.Date(filtered_data$Date, format = "%Y")
filtered_data$Date <- format(filtered_data$Date, "%Y")
filtered_data <- distinct(filtered_data)
head(filtered_data)
sampled_data <- filtered_data %>% group_by(Date) %>% slice_sample(n = 1000, replace = FALSE)
nrow(sampled_data)
[1] 15000
head(sampled_data)
selected_features <- c("Location.Description", "Date", "District", "Ward",
"Longitude", "Latitude", "Primary.Type", "Description")
data_cleaned <- sampled_data %>% select(all_of(selected_features))
summary(data_cleaned)
Location.Description Date District Ward Longitude Latitude Primary.Type
Length:15000 Length:15000 Min. : 1.00 Min. : 1.00 Min. :-91.69 Min. :36.62 Length:15000
Class :character Class :character 1st Qu.: 6.00 1st Qu.:11.00 1st Qu.:-87.72 1st Qu.:41.77 Class :character
Mode :character Mode :character Median :10.00 Median :23.00 Median :-87.67 Median :41.86 Mode :character
Mean :11.33 Mean :22.84 Mean :-87.67 Mean :41.84
3rd Qu.:17.00 3rd Qu.:34.00 3rd Qu.:-87.63 3rd Qu.:41.91
Max. :25.00 Max. :50.00 Max. :-87.52 Max. :42.02
Description
Length:15000
Class :character
Mode :character
nrow(data_cleaned)
[1] 15000
#Normalization
normal <-function(x) {
(x -min(x))/(max(x)-min(x))
}
data_cleaned <-as.data.frame(data_cleaned)
#Seperate features and targets
#features <- data_cleaned %>%
# select(-Primary.Type, -Description, -Location.Description, -Date)
features <- data_cleaned %>%
select(-Location.Description, -Date, -Description)
targets <- data_cleaned %>% select(Primary.Type, Description)
data_cleaned[3:6] <- as.data.frame(lapply(data_cleaned[,c(3,4,5,6)], normal))
norm_features <-data_cleaned
#norm_features <- as.data.frame(lapply(data_cleaned[,c(3,4,5,6)], normal))
# Divide data into training and testing split
set.seed(123)
ran <- sample(1:nrow(data_cleaned), 0.99 * nrow(data_cleaned))
#get training stuff
train_features <- norm_features[ran,]
#train_primary_type <- targets$Primary.Type[ran]
#train_description <- targets$Description[ran]
#get testing stuff
test_features <- norm_features[-ran, ]
#test_primary_type <- targets$Primary.Type[-ran]
#test_description <- targets$Description[-ran]
target_category <- data_cleaned[ran,6]
test_category <- data_cleaned[-ran,6]
summary(train_features)
Location.Description Date District Ward Longitude Latitude Primary.Type
Length:14850 Length:14850 Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000 Length:14850
Class :character Class :character 1st Qu.:0.2083 1st Qu.:0.2041 1st Qu.:0.9541 1st Qu.:0.9532 Class :character
Mode :character Mode :character Median :0.3750 Median :0.4490 Median :0.9656 Median :0.9694 Mode :character
Mean :0.4308 Mean :0.4458 Mean :0.9643 Mean :0.9666
3rd Qu.:0.6667 3rd Qu.:0.6735 3rd Qu.:0.9749 3rd Qu.:0.9787
Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
Description
Length:14850
Class :character
Mode :character
nrow(train_features)
[1] 14850
summary(test_features)
Location.Description Date District Ward Longitude Latitude Primary.Type
Length:150 Length:150 Min. :0.0000 Min. :0.0000 Min. :0.9227 Min. :0.9318 Length:150
Class :character Class :character 1st Qu.:0.1667 1st Qu.:0.1684 1st Qu.:0.9551 1st Qu.:0.9513 Class :character
Mode :character Mode :character Median :0.3750 Median :0.4082 Median :0.9671 Median :0.9682 Mode :character
Mean :0.4097 Mean :0.4367 Mean :0.9651 Mean :0.9654
3rd Qu.:0.6250 3rd Qu.:0.6735 3rd Qu.:0.9759 3rd Qu.:0.9781
Max. :1.0000 Max. :1.0000 Max. :0.9974 Max. :0.9983
Description
Length:150
Class :character
Mode :character
nrow(test_features)
[1] 150
library(class)
#Euclidean Distance Function
euclid <- function(a, b) {
sqrt(sum((a - b)^2))
}
#Nearest Neighbors algorithm
nearest_neighbors = function(x, obs, k, dist_func){
dist = apply(x, 1, dist_func, obs) #apply along the rows
distances = sort(dist)[1:k]
neighbor_list = which(dist %in% sort(dist)[1:k])
return(list(neighbor_list, distances))
}
#take last function and create a predicted class based on nearest neighbors
knn_classifier = function(x,y){
groups = table(x[,y])
pred = groups[groups == max(groups)]
return(pred)
}
classification_results <- data.frame(Prediction = character())
x = train_features[1:(nrow(train_features)),]
for (i in 1:nrow(test_features)) {
#observation to be classified
obs = test_features[i,]
nearestNeigbhors = nearest_neighbors(x[,3:6], obs[,3:6],7, euclid)
#print(nearestNeigbhors)
ind = nearestNeigbhors[[1]]
knn_classified = knn_classifier(x[ind,], 'Primary.Type')
print(names(knn_classified))
classification_results <- rbind(classification_results, Prediction = names(knn_classified))
#print(classification_results)
#print(obs[,7])
}
[1] "BATTERY"
[1] "THEFT"
[1] "BURGLARY"
[1] "BATTERY"
[1] "BATTERY"
[1] "THEFT"
[1] "BATTERY"
[1] "CRIMINAL DAMAGE"
[1] "BATTERY"
[1] "BATTERY"
[1] "THEFT"
[1] "THEFT"
[1] "NARCOTICS"
[1] "BURGLARY"
[1] "THEFT"
[1] "BATTERY" "CRIMINAL DAMAGE"
Warning in rbind(deparse.level, ...) :
number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "ASSAULT" "BATTERY" "BURGLARY" "DECEPTIVE PRACTICE"
[5] "GAMBLING" "OFFENSE INVOLVING CHILDREN" "WEAPONS VIOLATION"
Warning in rbind(deparse.level, ...) :
number of columns of result, 1, is not a multiple of vector length 7 of arg 2
[1] "CRIMINAL DAMAGE"
[1] "ASSAULT"
[1] "NARCOTICS"
[1] "BURGLARY" "NARCOTICS"
Warning in rbind(deparse.level, ...) :
number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "NARCOTICS"
[1] "BATTERY"
[1] "BATTERY" "THEFT"
Warning in rbind(deparse.level, ...) :
number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "NARCOTICS"
[1] "BATTERY"
[1] "ASSAULT"
[1] "BURGLARY" "THEFT"
Warning in rbind(deparse.level, ...) :
number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "BATTERY"
[1] "NARCOTICS"
[1] "THEFT"
[1] "BATTERY"
[1] "ASSAULT" "BATTERY" "CRIMINAL DAMAGE" "CRIMINAL TRESPASS" "MOTOR VEHICLE THEFT"
[6] "NARCOTICS" "PUBLIC PEACE VIOLATION"
Warning in rbind(deparse.level, ...) :
number of columns of result, 1, is not a multiple of vector length 7 of arg 2
[1] "BATTERY"
[1] "THEFT"
[1] "NARCOTICS"
[1] "THEFT"
[1] "THEFT"
[1] "BATTERY"
[1] "BATTERY"
[1] "OTHER OFFENSE"
[1] "CRIMINAL DAMAGE" "NARCOTICS"
Warning in rbind(deparse.level, ...) :
number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "NARCOTICS"
[1] "ASSAULT" "CRIMINAL DAMAGE" "THEFT"
Warning in rbind(deparse.level, ...) :
number of columns of result, 1, is not a multiple of vector length 3 of arg 2
[1] "BATTERY" "BURGLARY"
Warning in rbind(deparse.level, ...) :
number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "ASSAULT" "NARCOTICS"
Warning in rbind(deparse.level, ...) :
number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "BURGLARY"
[1] "BATTERY" "NARCOTICS"
Warning in rbind(deparse.level, ...) :
number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "BATTERY" "ROBBERY"
Warning in rbind(deparse.level, ...) :
number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "BATTERY" "CRIMINAL DAMAGE" "ROBBERY"
Warning in rbind(deparse.level, ...) :
number of columns of result, 1, is not a multiple of vector length 3 of arg 2
[1] "ASSAULT" "OFFENSE INVOLVING CHILDREN" "THEFT"
Warning in rbind(deparse.level, ...) :
number of columns of result, 1, is not a multiple of vector length 3 of arg 2
[1] "THEFT"
[1] "NARCOTICS"
[1] "BURGLARY" "NARCOTICS"
Warning in rbind(deparse.level, ...) :
number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "CRIMINAL DAMAGE"
[1] "ASSAULT" "BATTERY" "BURGLARY" "DECEPTIVE PRACTICE" "HOMICIDE" "OTHER OFFENSE"
[7] "THEFT"
Warning in rbind(deparse.level, ...) :
number of columns of result, 1, is not a multiple of vector length 7 of arg 2
[1] "BATTERY"
[1] "BURGLARY"
[1] "CRIMINAL DAMAGE" "THEFT"
Warning in rbind(deparse.level, ...) :
number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "BATTERY"
[1] "ASSAULT" "BATTERY"
Warning in rbind(deparse.level, ...) :
number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "THEFT"
[1] "CRIMINAL DAMAGE"
[1] "THEFT"
[1] "THEFT"
[1] "ASSAULT" "BURGLARY" "GAMBLING" "OFFENSE INVOLVING CHILDREN"
[5] "OTHER OFFENSE" "THEFT" "WEAPONS VIOLATION"
Warning in rbind(deparse.level, ...) :
number of columns of result, 1, is not a multiple of vector length 7 of arg 2
[1] "BATTERY"
[1] "CRIMINAL DAMAGE" "THEFT"
Warning in rbind(deparse.level, ...) :
number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "ASSAULT"
[1] "THEFT"
[1] "NARCOTICS"
[1] "THEFT"
[1] "BATTERY"
[1] "THEFT"
[1] "CRIMINAL DAMAGE" "THEFT"
Warning in rbind(deparse.level, ...) :
number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "DECEPTIVE PRACTICE" "HOMICIDE" "MOTOR VEHICLE THEFT" "NARCOTICS" "PROSTITUTION" "ROBBERY"
[7] "SEX OFFENSE"
Warning in rbind(deparse.level, ...) :
number of columns of result, 1, is not a multiple of vector length 7 of arg 2
[1] "WEAPONS VIOLATION"
[1] "CRIMINAL DAMAGE"
[1] "NARCOTICS"
[1] "THEFT"
[1] "PROSTITUTION"
[1] "BATTERY"
[1] "BATTERY"
[1] "THEFT"
[1] "DECEPTIVE PRACTICE" "THEFT"
Warning in rbind(deparse.level, ...) :
number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "OTHER OFFENSE"
[1] "THEFT"
[1] "THEFT"
[1] "NARCOTICS"
[1] "BATTERY" "CRIMINAL DAMAGE" "THEFT"
Warning in rbind(deparse.level, ...) :
number of columns of result, 1, is not a multiple of vector length 3 of arg 2
[1] "CRIMINAL DAMAGE" "THEFT"
Warning in rbind(deparse.level, ...) :
number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "DECEPTIVE PRACTICE"
[1] "BATTERY"
[1] "BATTERY"
[1] "CRIMINAL DAMAGE"
[1] "THEFT"
[1] "BATTERY"
[1] "THEFT"
[1] "NARCOTICS"
[1] "ROBBERY"
[1] "BATTERY" "NARCOTICS"
Warning in rbind(deparse.level, ...) :
number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "BATTERY"
[1] "PROSTITUTION" "THEFT"
Warning in rbind(deparse.level, ...) :
number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "BATTERY"
[1] "BATTERY"
[1] "BATTERY"
[1] "OTHER OFFENSE" "THEFT"
Warning in rbind(deparse.level, ...) :
number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "THEFT"
[1] "BATTERY" "CRIMINAL DAMAGE"
Warning in rbind(deparse.level, ...) :
number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "ASSAULT" "CRIMINAL DAMAGE"
Warning in rbind(deparse.level, ...) :
number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "THEFT"
[1] "ASSAULT" "BATTERY"
Warning in rbind(deparse.level, ...) :
number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "BATTERY" "BURGLARY"
Warning in rbind(deparse.level, ...) :
number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "CRIMINAL DAMAGE" "THEFT"
Warning in rbind(deparse.level, ...) :
number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "THEFT"
[1] "CRIMINAL DAMAGE"
[1] "ASSAULT"
[1] "CRIMINAL DAMAGE"
[1] "BATTERY" "MOTOR VEHICLE THEFT"
Warning in rbind(deparse.level, ...) :
number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "BATTERY"
[1] "THEFT"
[1] "THEFT"
[1] "BATTERY"
[1] "CRIMINAL DAMAGE"
[1] "OTHER OFFENSE"
[1] "BATTERY"
[1] "ASSAULT" "CRIMINAL DAMAGE"
Warning in rbind(deparse.level, ...) :
number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "NARCOTICS"
[1] "THEFT"
[1] "BATTERY"
[1] "BATTERY"
[1] "NARCOTICS"
[1] "BATTERY"
[1] "CRIMINAL DAMAGE" "THEFT"
Warning in rbind(deparse.level, ...) :
number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "BATTERY"
[1] "THEFT"
[1] "ASSAULT" "BATTERY"
Warning in rbind(deparse.level, ...) :
number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "THEFT"
[1] "NARCOTICS"
[1] "NARCOTICS"
[1] "NARCOTICS" "THEFT"
Warning in rbind(deparse.level, ...) :
number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "BATTERY" "CRIMINAL DAMAGE"
Warning in rbind(deparse.level, ...) :
number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "BURGLARY"
[1] "BATTERY"
[1] "ASSAULT" "BATTERY" "CRIMINAL DAMAGE" "MOTOR VEHICLE THEFT" "NARCOTICS"
[6] "PUBLIC PEACE VIOLATION" "THEFT"
Warning in rbind(deparse.level, ...) :
number of columns of result, 1, is not a multiple of vector length 7 of arg 2
[1] "THEFT"
[1] "ASSAULT" "THEFT"
Warning in rbind(deparse.level, ...) :
number of columns of result, 1, is not a multiple of vector length 2 of arg 2
[1] "NARCOTICS"
[1] "BATTERY"
[1] "BATTERY" "CRIMINAL DAMAGE"
Warning in rbind(deparse.level, ...) :
number of columns of result, 1, is not a multiple of vector length 2 of arg 2
accuracy <- function(x){
sum(diag(x)/(sum(rowSums(x)))) * 100
}
accuracy <- function(x,y){
matches <- x == y
accuracy <- sum(matches) / length(matches) * 100
return(accuracy)
}
#tab <- table(classification_results,obs[1:5,7])
print("Accuracy")
[1] "Accuracy"
print(accuracy(classification_results,test_features[,7]))
[1] 18.66667
#indices based on first taxi dist and then euclid dist
#print off the sub-data frame without class label
#Unused Code, originally planned for further testing, but does not follow the methodology of the research paper.
cosine_distance <- function(a, b) {
distance <- 1 - (sum(a * b) / (sqrt(sum(a^2)) * sqrt(sum(b^2))))
return(distance)
}
encoded_location <- data_cleaned.matrix(~Location.Description - 1, data = data)
encoded_type <- data_cleaned.matrix(~Primary.Type - 1, data = data)